library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt) ; library(limma)
library(knitr)
Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
datMeta = datMeta %>% mutate(ID = paste0('X',description))
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('ID'=datGenes$ensembl_gene_id) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
dplyr::rename(log2FoldChange = 'logFC', padj = 'adj.P.Val') %>%
mutate(significant=padj<0.05 & !is.na(padj))
rm(GO_annotations)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
xlab('Mean Expression') + ylab('Density') + ggtitle('Mean Expression distribution by Gene') +
theme_minimal()
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
xlab('Mean Expression') + ylab('Density') +
theme_minimal() + ggtitle('Mean expression distribution by Sample')
grid.arrange(p1, p2, nrow=1)
rm(p1, p2, plot_data)
The differences in level of expression between Brain Region are not statistically significant
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>%
left_join(datMeta, by='ID')
plot_data %>% ggplot(aes(Brain_lobe, Mean, fill = Brain_lobe)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
xlab('Brain Lobe') + ylab('') + theme_minimal() + theme(legend.position = 'none')
The two groups of genes seem to be partially characterised by genes with Neuronal function
In general, the ASD group has a higher dispersion than the control group
Only the difference by neuronal function is statistically significant
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>%
left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
scale_x_log10() + theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression by gene')
p3 = plot_data %>% ggplot(aes(Neuronal, Mean, fill = Neuronal)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE)) + theme_minimal() +
ylab('Mean Expression') + theme(legend.position = 'none')
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression by Sample')
p4 = plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE)) + theme_minimal() +
ylab('Mean Expression') + theme(legend.position = 'none')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(GO_annotations, plot_data, p1, p2, p3, p4)
In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene
plot_data = data.frame('ID'=rownames(datExpr),
'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))
plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) +
geom_abline(color = 'gray') + scale_color_viridis(direction = -1) +
ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()
summary(plot_data$std_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12.764425 -0.493884 0.006659 0.000000 0.490468 9.009259
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))
There are 225 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes. Gene ENSG00000129824 has the largest difference in mean expression between ASD and CTL, but it has a low level of expression, so it probably won’t be statistically significant (we confirm this in the Differential Expression Section)
There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups
Samples with autism tend to have higher values than the control group (as we had already seen above)
plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())
plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + theme_minimal() +
ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
The first principal component seems to separate relatively well the two Diagnosis
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by='ID') %>% dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of Samples')
rm(pca, plot_data)
Looks exactly the same as the PCA visualisation, just inverting the both axes
d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>% left_join(datMeta, by='ID') %>%
dplyr::select('C1','C2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point(alpha = 0.8) + theme_minimal() + ggtitle('MDS')
rm(d, fit, plot_data)
t-SNE manages to group samples by diagnosis with more than one perplexity
perplexities = c(1,3,5,8)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
left_join(datMeta, by='ID') %>%
dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(ps, perplexities, tsne, i)
In the Gandal dataset, the higher perplexity values managed to capture the subject the samples belonged to, it seems to do the same here, perhaps even clearer
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() +
theme(legend.position='none') + ggtitle('t-SNE Perplexity=30 coloured by Subject ID'))
rm(plot_data)
The First Principal Component explains over 99% of the total variance
There’s a really strong correlation between the mean expression of a gene and the 1st principal component
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA
perplexities = c(1,2,5,10,50,100)
ps = list()
for(i in 1:length(perplexities)){
tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
scale_color_viridis() + ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(perplexities, ps, tsne, i)
table(DE_info$padj<0.05, useNA='ifany')
##
## FALSE TRUE
## 13066 2449
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) +
scale_y_sqrt() + xlab('Log Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)
rm(p)
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID') %>%
mutate('statisticallySignificant' = ifelse(is.na(padj),NA, ifelse(padj<0.05, TRUE, FALSE)),
'alpha' = ifelse(padj>0.05 | is.na(padj), 0.1, 0.5))
plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color=statisticallySignificant)) +
geom_point(alpha = plot_data$alpha) + geom_smooth(method='lm') +
theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
xlab('Mean Expression') + ylab('LFC Magnitude') +
ggtitle('Log fold change by level of expression')
When filtering for differential expression, the points separate into two clouds depending on whether they are over- or under-expressed
The top cloud corresponds to the under-expressed genes and the bottom to the over-expressed ones
datExpr_DE = datExpr[DE_info$significant,]
pca = datExpr_DE %>% prcomp
plot_data = cbind(data.frame('PC1'=pca$x[,1], 'PC2'=pca$x[,2]), DE_info[DE_info$significant==TRUE,])
pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.05, pos_zero, pos_zero+0.05, 1)) +
theme_minimal() + ggtitle('
PCA of differentially expressed genes') + # This is on purpose, PDF doesn't save well without this white space (?)
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme(legend.position = 'bottom')
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)
rm(pos_zero, p)
Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed
plot_data = plot_data %>% mutate('Group'=ifelse(log2FoldChange>0,'overexpressed','underexpressed')) %>%
mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))
List of top DE genes
# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>%
rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
## Batch submitting query [==================>------------] 60% eta: 0s Batch
## submitting query [========================>------] 80% eta: 0s Batch submitting
## query [===============================] 100% eta: 0s
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% arrange(-abs(log2FoldChange)) %>%
top_n(50, wt=log2FoldChange)
kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal, Group))
| ID | gene_name | log2FoldChange | padj | Neuronal | Group |
|---|---|---|---|---|---|
| ENSG00000173915 | USMG5 | 0.7612407 | 0.0014621 | 0 | overexpressed |
| ENSG00000182103 | FAM181B | 0.6627976 | 0.0000757 | 0 | overexpressed |
| ENSG00000196136 | SERPINA3 | 0.6095301 | 0.0430646 | 0 | overexpressed |
| ENSG00000073756 | PTGS2 | 0.5214530 | 0.0090993 | 1 | overexpressed |
| ENSG00000106211 | HSPB1 | 0.5163140 | 0.0001058 | 0 | overexpressed |
| ENSG00000101327 | PDYN | 0.5029224 | 0.0004849 | 0 | overexpressed |
| ENSG00000158373 | HIST1H2BD | 0.4482958 | 0.0000048 | 0 | overexpressed |
| ENSG00000175206 | NPPA | 0.4440241 | 0.0005972 | 0 | overexpressed |
| ENSG00000124762 | CDKN1A | 0.4355518 | 0.0070656 | 0 | overexpressed |
| ENSG00000108691 | CCL2 | 0.4305914 | 0.0163803 | 1 | overexpressed |
| ENSG00000204219 | TCEA3 | 0.4305223 | 0.0000474 | 0 | overexpressed |
| ENSG00000163520 | FBLN2 | 0.4182056 | 0.0000399 | 0 | overexpressed |
| ENSG00000114200 | BCHE | 0.4179747 | 0.0004748 | 0 | overexpressed |
| ENSG00000132002 | DNAJB1 | 0.4176425 | 0.0164132 | 0 | overexpressed |
| ENSG00000012963 | UBR7 | 0.4127440 | 0.0000158 | 0 | overexpressed |
| ENSG00000173486 | FKBP2 | 0.4124107 | 0.0006110 | 0 | overexpressed |
| ENSG00000105697 | HAMP | 0.4089664 | 0.0489586 | 0 | overexpressed |
| ENSG00000180573 | HIST1H2AC | 0.3883532 | 0.0000066 | 0 | overexpressed |
| ENSG00000136235 | GPNMB | 0.3875091 | 0.0167178 | 0 | overexpressed |
| ENSG00000101298 | SNPH | 0.3873132 | 0.0001051 | 1 | overexpressed |
| ENSG00000159189 | C1QC | 0.3855800 | 0.0007581 | 0 | overexpressed |
| ENSG00000221869 | CEBPD | 0.3805648 | 0.0010027 | 0 | overexpressed |
| ENSG00000134531 | EMP1 | 0.3798648 | 0.0191595 | 0 | overexpressed |
| ENSG00000184678 | HIST2H2BE | 0.3754434 | 0.0000087 | 0 | overexpressed |
| ENSG00000134853 | PDGFRA | 0.3710305 | 0.0031163 | 0 | overexpressed |
| ENSG00000026559 | KCNG1 | 0.3661942 | 0.0358647 | 0 | overexpressed |
| ENSG00000204347 | BTBD17 | 0.3633140 | 0.0081243 | 0 | overexpressed |
| ENSG00000197019 | SERTAD1 | 0.3611023 | 0.0017659 | 0 | overexpressed |
| ENSG00000103260 | METRN | 0.3610544 | 0.0004111 | 0 | overexpressed |
| ENSG00000177707 | PVRL3 | 0.3541528 | 0.0004741 | 0 | overexpressed |
| ENSG00000197903 | HIST1H2BK | 0.3541393 | 0.0000004 | 0 | overexpressed |
| ENSG00000149257 | SERPINH1 | 0.3534902 | 0.0206716 | 0 | overexpressed |
| ENSG00000113742 | CPEB4 | 0.3426404 | 0.0065112 | 1 | overexpressed |
| ENSG00000164253 | WDR41 | 0.3389893 | 0.0052486 | 0 | overexpressed |
| ENSG00000126767 | ELK1 | 0.3337928 | 0.0001943 | 0 | overexpressed |
| ENSG00000189306 | RRP7A | 0.3335859 | 0.0002759 | 0 | overexpressed |
| ENSG00000162623 | TYW3 | 0.3311540 | 0.0003521 | 0 | overexpressed |
| ENSG00000069667 | RORA | 0.3291238 | 0.0024389 | 0 | overexpressed |
| ENSG00000160049 | DFFA | 0.3285121 | 0.0000012 | 0 | overexpressed |
| ENSG00000134086 | VHL | 0.3280740 | 0.0001907 | 0 | overexpressed |
| ENSG00000170458 | CD14 | 0.3219690 | 0.0288669 | 0 | overexpressed |
| ENSG00000134769 | DTNA | 0.3157405 | 0.0096716 | 0 | overexpressed |
| ENSG00000150551 | LYPD1 | 0.3156217 | 0.0033662 | 0 | overexpressed |
| ENSG00000130208 | APOC1 | 0.3123769 | 0.0430646 | 0 | overexpressed |
| ENSG00000137193 | PIM1 | 0.3120680 | 0.0001198 | 0 | overexpressed |
| ENSG00000154478 | GPR26 | 0.3117004 | 0.0027446 | 0 | overexpressed |
| ENSG00000177606 | JUN | 0.3112235 | 0.0002654 | 1 | overexpressed |
| ENSG00000074410 | CA12 | 0.3099400 | 0.0008752 | 0 | overexpressed |
| ENSG00000144566 | RAB5A | 0.3085956 | 0.0002759 | 0 | overexpressed |
| ENSG00000106624 | AEBP1 | 0.3078414 | 0.0007859 | 0 | overexpressed |
rm(top_genes)
Plotting the mean expression by group in Gandal’s dataset there seemed to exist underlying distributions, so we would use GMM to separate them, but everything seems very homogeneous here, so this doesn’t seem to be necessary. (If we do it anyway we can see that they still cluster by mean expression, which makes sense since it explains the majority of the variance of the genes)
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
tot_n_clusters = 4
plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))
GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
'Membership'=GMM_G1$Log_likelihood %>%
apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
'Membership'=GMM_G2$Log_likelihood %>%
apply(1, function(x) glue('under_', which.max(x))))
plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')
p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) +
theme_minimal() + theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(x=Group, y=MeanExpr, fill=Group)) + ylab('Mean Expression') + xlab('') +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
theme_minimal() + theme(legend.position='none')
p3 = plot_data %>% ggplot(aes(x=MeanExpr)) + ylab('Density') +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
args=list(mean=GMM_G1$centroids[3], sd=GMM_G1$covariance_matrices[3])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
theme_minimal()
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() +
theme(legend.position='bottom')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2, p3, tot_n_clusters)
For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any strong patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.
plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) +
theme_minimal() + theme(legend.position = 'bottom')
rm(plot_data)
fc_list = seq(1, 1.4, 0.01)
n_genes = nrow(datExpr)
# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)
# Initialise DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=rownames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
pca_genes_old = pcas_genes
for(fc in fc_list){
# Recalculate DE_info with the new threshold (p-values change) an filter DE genes
DE_genes = topTable(efit, coef=2, number=Inf, sort.by='none', lfc = log2(fc)) %>% data.frame %>%
left_join(data.frame('ID' = rownames(datExpr), 'AveExpr' = rowMeans(datExpr)), by = 'AveExpr') %>%
mutate(padj = adj.P.Val) %>% filter(padj<0.05)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
pca_genes_new$PC1 = -pca_genes_new$PC1
}
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
pca_genes_new$PC2 = -pca_genes_new$PC2
}
pca_samps_old = pca_samps_new
pca_genes_old = pca_genes_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
pcas_genes = rbind(pcas_genes, pca_genes_new)
}
# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% left_join(datMeta, by='ID') %>%
dplyr::select(ID, PC1, PC2, fc, Diagnosis, Brain_lobe)
# Plot change of number of genes
ggplotly(data.frame('lfc'=log2(fc_list), 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Threshold') + ylab('DE Genes') +
ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(fc_list, n_genes, fc, pca_samps_new, pca_genes_new, pca_samps_old, pca_genes_old,
datExpr_pca_samps, datExpr_pca_genes)
Note: PC values get smaller as LFC increases, so on each iteration the values were scaled so it would be easier to compare between frames
LFC = -1 represents the whole set of genes, without any filtering by differential expression
The LFC threshold doesn’t seem to make a big difference for differentiating genes by Diagnosis
ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>%
ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
There doesn’t seem to be a strong recognisable pattern
ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>%
ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha = 0.7) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
if(!'fcSign' %in% colnames(pcas_genes)){
pcas_genes = pcas_genes %>% left_join(DE_info, by='ID') %>%
mutate(LFCSign = ifelse(log2FoldChange>0,'Positive','Negative'))
}
ggplotly(pcas_genes %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>%
ggplot(aes(PC1, PC2, color=LFCSign)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.2) +
theme_minimal() + ggtitle('Genes PCA plot modifying LFC Magnitude filtering threshold'))